#!/bin/bash
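# Launch multi-node full-parameter SFT of Qwen1.5-14B-Chat with DeepSpeed
# (ZeRO stage 3, no offload) via a LLaMA-Factory-style src/train.py.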

model_path="/data/nfs-ten1/nfs/meichaoyang001/model/Qwen1.5-14B-Chat_240417"
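
# Mount the NFS share that backs the checkpoint output directory; skip the
# mount when it is already present so reruns of the script do not fail.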
mkdir -p /data/nfs-ten12/nfs
mountpoint -q /data/nfs-ten12/nfs || mount 10.26.192.5:/disk/vdb4/nfs-ten12 /data/nfs-ten12/nfs

# Weights & Biases logging is disabled for this run; if you re-enable it,
# read WANDB_API_KEY from the environment instead of hardcoding a secret here.
export WANDB_DISABLED=true

echo "Configuring environment variables"
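# NCCL settings for multi-node training over bonded Mellanox (mlx5) NICs:
# bootstrap over eth0, keep InfiniBand/RoCE transport enabled, and gate
# GPUDirect RDMA by PCIe topology distance (NCCL_NET_GDR_LEVEL=2).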
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_GID_INDEX=3
export NCCL_IB_DISABLE=0
export NCCL_IB_HCA=mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3,mlx5_bond_4,mlx5_bond_5,mlx5_bond_6,mlx5_bond_7
export NCCL_NET_GDR_LEVEL=2
export NCCL_IB_QPS_PER_CONNECTION=4
export NCCL_IB_TC=160
export NCCL_IB_TIMEOUT=22

# Uncomment for verbose NCCL diagnostics:
# export NCCL_DEBUG=INFO

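# The platform writes the master node's address to this file; DeepSpeed's
# multi-node launcher dispatches ranks to the hosts in the hostfile via pdsh.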
MASTER_IP=$(cat /etc/aistudio/masteraddr)
apt-get install pdsh -y
export PDSH_SSH_ARGS_APPEND=""

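# Activate the shared conda environment prepared for this training run.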
eval "$(conda shell.bash hook)"
conda activate /data/nfs-ten1/nfs/meichaoyang001/envs/llama_240508_cuda12_2

echo "Starting training"

deepspeed --hostfile /etc/aistudio/hostfile --master_addr "$MASTER_IP" --ssh_port 20023 src/train.py \
    --deepspeed conf/ds_stage3_config_qwen_no_offload.json \
    --stage sft \
    --model_name_or_path "$model_path" \
    --do_train \
    --dataset 240402_gpt4_118683_shuffle \
    --preprocessing_num_workers 16 \
    --template qwen \
    --finetuning_type full \
    --output_dir /data/nfs-ten12/nfs/meichaoyang001/checkpoint/qwen_1.5_14b_chat_full_sft_240402_gpt4_118683_shuffle_bs128_lr2e-5_2node \
    --overwrite_cache \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 100 \
    --save_steps 2000 \
    --save_only_model \
    --learning_rate 2e-5 \
    --num_train_epochs 3.0 \
    --plot_loss \
    --bf16 \
    --ignore_len 10240 \
    --cutoff_len 102400 \
    --flash_attn fa2 \
    --use_fast_tokenizer true
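
# Effective global batch size: per_device_train_batch_size (1)
# x gradient_accumulation_steps (8) x world size; with 2 nodes x 8 GPUs
# (assumed) this gives 128, matching the "bs128" tag in output_dir.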
